# pacman::p_load(here,
# sf, tidyverse, spdep,
# funModeling, tmap, ggpubr,
# corrplot, heatmaply, cluster, ClustGeo, factoextra, GGally,
# blorr, skimr, caret, GWmodel,report
# )In Class Ex5 - Spatially Weighted Regression
Overview
Packages used:
- here: used to generate the file path to a specific folder
- tidyverse
- readr: for reading rectangular files
- sf: Used to handle spatial dataframes
- funModeling: Used for plotting and EDA
- tmap: Use for plotting geo spatial maps
- skimr: provides summary statistics about variables in the dataframe
- corrplot: Used for performing correlation analysis
- blorr: Used to generate a report for our logistic regression model
- caret: Package for confusion matrix
Step 1: Load the required packages
We load the packages as follow:
pacman::p_load(here, sf, tidyverse, funModeling, tmap,
skimr, corrplot, blorr, GWmodel, caret
)We create the boundary path with the here() function:
boundary_path <- here("data", "dataOsun", "Osun.rds")
boundary_path[1] "D:/f4sared/ISSS624/data/dataOsun/Osun.rds"
We create the path to the water point data:
data_path <- here("data", "dataOsun", "Osun_wp_sf.rds")
data_path[1] "D:/f4sared/ISSS624/data/dataOsun/Osun_wp_sf.rds"
Using read_rds() from readr (tidyverse):
osun <- read_rds(boundary_path)Using read_rds() from readr (tidyverse):
osun_wp_sf <- read_rds(data_path)Using freq() from funModeling, we explore the status distribution:
osun_wp_sf %>% freq(input = 'status')Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the funModeling package.
Please report the issue at <https://github.com/pablo14/funModeling/issues>.

status frequency percentage cumulative_perc
1 TRUE 2642 55.5 55.5
2 FALSE 2118 44.5 100.0
Using the tmap package, we will plot the functional and non-functional water points:
tmap_mode('view')tmap mode set to interactive viewing
tm_shape(osun)+
tm_polygons(alpha=0.4)+
tm_shape(osun_wp_sf) +
tm_dots(col="status", alpha = 0.6) +
tm_view(set.zoom.limits = c(9,12))Step 2: EDA
Using the skim() from skimr package, we obtain the summary statistics as follow:
osun_wp_sf %>% skim() Warning: Couldn't find skimmers for class: sfc_POINT, sfc; No user-defined `sfl`
provided. Falling back to `character`.
| Name | Piped data |
| Number of rows | 4760 |
| Number of columns | 75 |
| _______________________ | |
| Column type frequency: | |
| character | 47 |
| logical | 5 |
| numeric | 23 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| source | 0 | 1.00 | 5 | 44 | 0 | 2 | 0 |
| report_date | 0 | 1.00 | 22 | 22 | 0 | 42 | 0 |
| status_id | 0 | 1.00 | 2 | 7 | 0 | 3 | 0 |
| water_source_clean | 0 | 1.00 | 8 | 22 | 0 | 3 | 0 |
| water_source_category | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| water_tech_clean | 24 | 0.99 | 9 | 23 | 0 | 3 | 0 |
| water_tech_category | 24 | 0.99 | 9 | 15 | 0 | 2 | 0 |
| facility_type | 0 | 1.00 | 8 | 8 | 0 | 1 | 0 |
| clean_country_name | 0 | 1.00 | 7 | 7 | 0 | 1 | 0 |
| clean_adm1 | 0 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| clean_adm2 | 0 | 1.00 | 3 | 14 | 0 | 35 | 0 |
| clean_adm3 | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| clean_adm4 | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| installer | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| management_clean | 1573 | 0.67 | 5 | 37 | 0 | 7 | 0 |
| status_clean | 0 | 1.00 | 9 | 32 | 0 | 7 | 0 |
| pay | 0 | 1.00 | 2 | 39 | 0 | 7 | 0 |
| fecal_coliform_presence | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| subjective_quality | 0 | 1.00 | 18 | 20 | 0 | 4 | 0 |
| activity_id | 4757 | 0.00 | 36 | 36 | 0 | 3 | 0 |
| scheme_id | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| wpdx_id | 0 | 1.00 | 12 | 12 | 0 | 4760 | 0 |
| notes | 0 | 1.00 | 2 | 96 | 0 | 3502 | 0 |
| orig_lnk | 4757 | 0.00 | 84 | 84 | 0 | 1 | 0 |
| photo_lnk | 41 | 0.99 | 84 | 84 | 0 | 4719 | 0 |
| country_id | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
| data_lnk | 0 | 1.00 | 79 | 96 | 0 | 2 | 0 |
| water_point_history | 0 | 1.00 | 142 | 834 | 0 | 4750 | 0 |
| clean_country_id | 0 | 1.00 | 3 | 3 | 0 | 1 | 0 |
| country_name | 0 | 1.00 | 7 | 7 | 0 | 1 | 0 |
| water_source | 0 | 1.00 | 8 | 30 | 0 | 4 | 0 |
| water_tech | 0 | 1.00 | 5 | 37 | 0 | 20 | 0 |
| adm2 | 0 | 1.00 | 3 | 14 | 0 | 33 | 0 |
| adm3 | 4760 | 0.00 | NA | NA | 0 | 0 | 0 |
| management | 1573 | 0.67 | 5 | 47 | 0 | 7 | 0 |
| adm1 | 0 | 1.00 | 4 | 5 | 0 | 4 | 0 |
| New Georeferenced Column | 0 | 1.00 | 16 | 35 | 0 | 4760 | 0 |
| lat_lon_deg | 0 | 1.00 | 13 | 32 | 0 | 4760 | 0 |
| public_data_source | 0 | 1.00 | 84 | 102 | 0 | 2 | 0 |
| converted | 0 | 1.00 | 53 | 53 | 0 | 1 | 0 |
| created_timestamp | 0 | 1.00 | 22 | 22 | 0 | 2 | 0 |
| updated_timestamp | 0 | 1.00 | 22 | 22 | 0 | 2 | 0 |
| Geometry | 0 | 1.00 | 33 | 37 | 0 | 4760 | 0 |
| ADM2_EN | 0 | 1.00 | 3 | 14 | 0 | 30 | 0 |
| ADM2_PCODE | 0 | 1.00 | 8 | 8 | 0 | 30 | 0 |
| ADM1_EN | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| ADM1_PCODE | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| rehab_year | 4760 | 0 | NaN | : |
| rehabilitator | 4760 | 0 | NaN | : |
| is_urban | 0 | 1 | 0.39 | FAL: 2884, TRU: 1876 |
| latest_record | 0 | 1 | 1.00 | TRU: 4760 |
| status | 0 | 1 | 0.56 | TRU: 2642, FAL: 2118 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| row_id | 0 | 1.00 | 68550.48 | 10216.94 | 49601.00 | 66874.75 | 68244.50 | 69562.25 | 471319.00 | ▇▁▁▁▁ |
| lat_deg | 0 | 1.00 | 7.68 | 0.22 | 7.06 | 7.51 | 7.71 | 7.88 | 8.06 | ▁▂▇▇▇ |
| lon_deg | 0 | 1.00 | 4.54 | 0.21 | 4.08 | 4.36 | 4.56 | 4.71 | 5.06 | ▃▆▇▇▂ |
| install_year | 1144 | 0.76 | 2008.63 | 6.04 | 1917.00 | 2006.00 | 2010.00 | 2013.00 | 2015.00 | ▁▁▁▁▇ |
| fecal_coliform_value | 4760 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
| distance_to_primary_road | 0 | 1.00 | 5021.53 | 5648.34 | 0.01 | 719.36 | 2972.78 | 7314.73 | 26909.86 | ▇▂▁▁▁ |
| distance_to_secondary_road | 0 | 1.00 | 3750.47 | 3938.63 | 0.15 | 460.90 | 2554.25 | 5791.94 | 19559.48 | ▇▃▁▁▁ |
| distance_to_tertiary_road | 0 | 1.00 | 1259.28 | 1680.04 | 0.02 | 121.25 | 521.77 | 1834.42 | 10966.27 | ▇▂▁▁▁ |
| distance_to_city | 0 | 1.00 | 16663.99 | 10960.82 | 53.05 | 7930.75 | 15030.41 | 24255.75 | 47934.34 | ▇▇▆▃▁ |
| distance_to_town | 0 | 1.00 | 16726.59 | 12452.65 | 30.00 | 6876.92 | 12204.53 | 27739.46 | 44020.64 | ▇▅▃▃▂ |
| rehab_priority | 2654 | 0.44 | 489.33 | 1658.81 | 0.00 | 7.00 | 91.50 | 376.25 | 29697.00 | ▇▁▁▁▁ |
| water_point_population | 4 | 1.00 | 513.58 | 1458.92 | 0.00 | 14.00 | 119.00 | 433.25 | 29697.00 | ▇▁▁▁▁ |
| local_population_1km | 4 | 1.00 | 2727.16 | 4189.46 | 0.00 | 176.00 | 1032.00 | 3717.00 | 36118.00 | ▇▁▁▁▁ |
| crucialness_score | 798 | 0.83 | 0.26 | 0.28 | 0.00 | 0.07 | 0.15 | 0.35 | 1.00 | ▇▃▁▁▁ |
| pressure_score | 798 | 0.83 | 1.46 | 4.16 | 0.00 | 0.12 | 0.41 | 1.24 | 93.69 | ▇▁▁▁▁ |
| usage_capacity | 0 | 1.00 | 560.74 | 338.46 | 300.00 | 300.00 | 300.00 | 1000.00 | 1000.00 | ▇▁▁▁▅ |
| days_since_report | 0 | 1.00 | 2692.69 | 41.92 | 1483.00 | 2688.00 | 2693.00 | 2700.00 | 4645.00 | ▁▇▁▁▁ |
| staleness_score | 0 | 1.00 | 42.80 | 0.58 | 23.13 | 42.70 | 42.79 | 42.86 | 62.66 | ▁▁▇▁▁ |
| location_id | 0 | 1.00 | 235865.49 | 6657.60 | 23741.00 | 230638.75 | 236199.50 | 240061.25 | 267454.00 | ▁▁▁▁▇ |
| cluster_size | 0 | 1.00 | 1.05 | 0.25 | 1.00 | 1.00 | 1.00 | 1.00 | 4.00 | ▇▁▁▁▁ |
| lat_deg_original | 4760 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
| lon_deg_original | 4760 | 0.00 | NaN | NA | NA | NA | NA | NA | NA | |
| count | 0 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
Step 3: Selecting variables
Next, we will filter away the rows with na under the columns that we are interest in. We will also recode the usage_capacity as factor instead of the numerical form. This is done in preparation for the logistic regression.
We run the code as follow:
osun_wp_sf_clean <- osun_wp_sf %>%
filter_at(vars(status,
distance_to_primary_road,
distance_to_secondary_road,
distance_to_tertiary_road,
distance_to_city,
distance_to_town,
water_point_population,
local_population_1km,
usage_capacity,
is_urban,
water_source_clean),
all_vars(!is.na(.))) %>%
mutate(usage_capacity = as.factor(usage_capacity))Next we select the columns that we want:
osun_wp <- osun_wp_sf_clean %>% select(c(7,35:39,42:43,46:47,57)) %>% st_set_geometry(NULL)Step 4: Correlation Analysis:
Next, we perform correlation analysis using cor() function of corrplot:
cluster_vars.cor = cor(osun_wp[,2:7])
corrplot.mixed(cluster_vars.cor,
lower="ellipse",
upper = "number",
tl.pos="lt",
diag="l",
tl.col="black")
From the corrplot above, we do not see any variables that are highly correlated. We can then proceed with building a simple logistic regression model
Step 5: Building a simple logistic regression model
Logistic Regression Model
Using the glm() of the R statistics, we build a logistic regression model as follow:
model <- glm(status ~ distance_to_primary_road +
distance_to_secondary_road +
distance_to_tertiary_road +
distance_to_city +
distance_to_town +
is_urban +
usage_capacity +
water_source_clean +
water_point_population +
local_population_1km,
# data = osun_wp_sf_clean,
data = osun_wp,
family = binomial(link = "logit"))Model Report
Next we use blr_regress() of blorr to provide us a report on our model:
blr_regress(model) Model Overview
------------------------------------------------------------------------
Data Set Resp Var Obs. Df. Model Df. Residual Convergence
------------------------------------------------------------------------
data status 4756 4755 4744 TRUE
------------------------------------------------------------------------
Response Summary
--------------------------------------------------------
Outcome Frequency Outcome Frequency
--------------------------------------------------------
0 2114 1 2642
--------------------------------------------------------
Maximum Likelihood Estimates
-----------------------------------------------------------------------------------------------
Parameter DF Estimate Std. Error z value Pr(>|z|)
-----------------------------------------------------------------------------------------------
(Intercept) 1 0.3887 0.1124 3.4588 5e-04
distance_to_primary_road 1 0.0000 0.0000 -0.7153 0.4744
distance_to_secondary_road 1 0.0000 0.0000 -0.5530 0.5802
distance_to_tertiary_road 1 1e-04 0.0000 4.6708 0.0000
distance_to_city 1 0.0000 0.0000 -4.7574 0.0000
distance_to_town 1 0.0000 0.0000 -4.9170 0.0000
is_urbanTRUE 1 -0.2971 0.0819 -3.6294 3e-04
usage_capacity1000 1 -0.6230 0.0697 -8.9366 0.0000
water_source_cleanProtected Shallow Well 1 0.5040 0.0857 5.8783 0.0000
water_source_cleanProtected Spring 1 1.2882 0.4388 2.9359 0.0033
water_point_population 1 -5e-04 0.0000 -11.3686 0.0000
local_population_1km 1 3e-04 0.0000 19.2953 0.0000
-----------------------------------------------------------------------------------------------
Association of Predicted Probabilities and Observed Responses
---------------------------------------------------------------
% Concordant 0.7347 Somers' D 0.4693
% Discordant 0.2653 Gamma 0.4693
% Tied 0.0000 Tau-a 0.2318
Pairs 5585188 c 0.7347
---------------------------------------------------------------
The report shows us that at 95% confidence level, all the variables are significant except distance to primary road and the distance to secondary road.
Note from class:
Categorical variable z value: +ve value implies above average correlation while -ve value implies a below average correlation.
Continuous variable z value: +ve value implies direct correlation while -ve value implies inverse correlation.
Confusion Matrix
Next, using blr_confusion_matrix() of blorr, we generate a confusion matrix as follow:
blr_confusion_matrix(model, cutoff=0.5)Confusion Matrix and Statistics
Reference
Prediction FALSE TRUE
0 1301 738
1 813 1904
Accuracy : 0.6739
No Information Rate : 0.4445
Kappa : 0.3373
McNemars's Test P-Value : 0.0602
Sensitivity : 0.7207
Specificity : 0.6154
Pos Pred Value : 0.7008
Neg Pred Value : 0.6381
Prevalence : 0.5555
Detection Rate : 0.4003
Detection Prevalence : 0.5713
Balanced Accuracy : 0.6680
Precision : 0.7008
Recall : 0.7207
'Positive' Class : 1
Notes from class:
The cutoff value of 0.5 means that any probability above 0.5 gets classified as true, while other values get classified as false. The default value we generally use is 0.5.
Sensitivity: TP/(TP/FN) >> 0.7207
Specificity: TN/(TN+FP) >> 0.6154
Accuracy: (TP+TN)/(TP+FP+TN+FN)
Since sensitivity is higher than the specificity, our model is better at catching true positives than catching true negatives.
Step 6: Spatially weighted geographical regression gwLR model
Converting to spatial data
Notes: In the past, spatial data used to be the common currency in model building. So many functions still take in spatial data. Hence the conversion is needed. The simple features dataframe only came in recently.
First we need to convert to spatial data as follow:
osun_wp_sp <- osun_wp_sf_clean %>%
select(c(status,
distance_to_primary_road,
distance_to_secondary_road,
distance_to_tertiary_road,
distance_to_city,
distance_to_town,
water_point_population,
local_population_1km,
is_urban,
usage_capacity,
water_source_clean)) %>% as_Spatial()
osun_wp_spclass : SpatialPointsDataFrame
features : 4756
extent : 182502.4, 290751, 340054.1, 450905.3 (xmin, xmax, ymin, ymax)
crs : +proj=tmerc +lat_0=4 +lon_0=8.5 +k=0.99975 +x_0=670553.98 +y_0=0 +a=6378249.145 +rf=293.465 +towgs84=-92,-93,122,0,0,0,0 +units=m +no_defs
variables : 11
names : status, distance_to_primary_road, distance_to_secondary_road, distance_to_tertiary_road, distance_to_city, distance_to_town, water_point_population, local_population_1km, is_urban, usage_capacity, water_source_clean
min values : 0, 0.014461356813335, 0.152195902540837, 0.017815121653488, 53.0461399623541, 30.0019777713073, 0, 0, 0, 1000, Borehole
max values : 1, 26909.8616132094, 19559.4793799085, 10966.2705628969, 47934.343603562, 44020.6393368124, 29697, 36118, 1, 300, Protected Spring
Determine the fixed bandwidth
Using bw.ggwr() from GWmodel package, we will next determine the fixed bandwidth:
bw.fixed <- bw.ggwr(status ~
distance_to_primary_road +
distance_to_secondary_road+
distance_to_city+
distance_to_town+
water_point_population+
local_population_1km+
is_urban+
usage_capacity+
water_source_clean,
data=osun_wp_sp,
family="binomial",
approach="AIC",
kernel = "gaussian",
adaptive = FALSE,
longlat = FALSE)bw.fixedCreating the model
using ggwr.basic() from the GWmodel, we will build the model as follow:
gwlr.fixed <- ggwr.basic(status ~
distance_to_primary_road +
distance_to_secondary_road +
distance_to_tertiary_road +
distance_to_city +
distance_to_town +
water_point_population +
local_population_1km +
is_urban +
usage_capacity +
water_source_clean,
data=osun_wp_sp,
bw=2448.701,
family = "binomial",
kernel = "gaussian",
adaptive = FALSE,
longlat = FALSE) Iteration Log-Likelihood
=========================
0 -1905
1 -1611
2 -1453
3 -1363
4 -1322
5 -1322
We get the results as follow:
gwlr.fixed ***********************************************************************
* Package GWmodel *
***********************************************************************
Program starts at: 2022-12-18 00:33:45
Call:
ggwr.basic(formula = status ~ distance_to_primary_road + distance_to_secondary_road +
distance_to_tertiary_road + distance_to_city + distance_to_town +
water_point_population + local_population_1km + is_urban +
usage_capacity + water_source_clean, data = osun_wp_sp, bw = 2448.701,
family = "binomial", kernel = "gaussian", adaptive = FALSE,
longlat = FALSE)
Dependent (y) variable: status
Independent variables: distance_to_primary_road distance_to_secondary_road distance_to_tertiary_road distance_to_city distance_to_town water_point_population local_population_1km is_urban usage_capacity water_source_clean
Number of data points: 4756
Used family: binomial
***********************************************************************
* Results of Generalized linear Regression *
***********************************************************************
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-124.555 -1.755 1.072 1.742 34.333
Coefficients:
Estimate Std. Error z value Pr(>|z|)
Intercept 3.887e-01 1.124e-01 3.459 0.000543
distance_to_primary_road -4.642e-06 6.490e-06 -0.715 0.474422
distance_to_secondary_road -5.143e-06 9.299e-06 -0.553 0.580230
distance_to_tertiary_road 9.683e-05 2.073e-05 4.671 3.00e-06
distance_to_city -1.686e-05 3.544e-06 -4.757 1.96e-06
distance_to_town -1.480e-05 3.009e-06 -4.917 8.79e-07
water_point_population -5.097e-04 4.484e-05 -11.369 < 2e-16
local_population_1km 3.451e-04 1.788e-05 19.295 < 2e-16
is_urbanTRUE -2.971e-01 8.185e-02 -3.629 0.000284
usage_capacity1000 -6.230e-01 6.972e-02 -8.937 < 2e-16
water_source_cleanProtected Shallow Well 5.040e-01 8.574e-02 5.878 4.14e-09
water_source_cleanProtected Spring 1.288e+00 4.388e-01 2.936 0.003325
Intercept ***
distance_to_primary_road
distance_to_secondary_road
distance_to_tertiary_road ***
distance_to_city ***
distance_to_town ***
water_point_population ***
local_population_1km ***
is_urbanTRUE ***
usage_capacity1000 ***
water_source_cleanProtected Shallow Well ***
water_source_cleanProtected Spring **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6534.5 on 4755 degrees of freedom
Residual deviance: 5688.0 on 4744 degrees of freedom
AIC: 5712
Number of Fisher Scoring iterations: 5
AICc: 5712.099
Pseudo R-square value: 0.1295351
***********************************************************************
* Results of Geographically Weighted Regression *
***********************************************************************
*********************Model calibration information*********************
Kernel function: gaussian
Fixed bandwidth: 2448.701
Regression points: the same locations as observations are used.
Distance metric: A distance matrix is specified for this model calibration.
************Summary of Generalized GWR coefficient estimates:**********
Min. 1st Qu. Median
Intercept -1.8651e+03 -5.8932e+00 2.1381e+00
distance_to_primary_road -2.5734e-02 -5.5534e-04 5.2945e-05
distance_to_secondary_road -1.4871e-02 -3.8712e-04 1.8342e-04
distance_to_tertiary_road -1.8676e-02 -5.0683e-04 1.1704e-04
distance_to_city -2.5681e-02 -6.9830e-04 -1.4487e-04
distance_to_town -2.4549e-02 -6.5984e-04 -1.9127e-04
water_point_population -9.6140e-02 -2.4721e-03 -1.0261e-03
local_population_1km -1.6277e-01 5.3119e-04 1.1204e-03
is_urbanTRUE -2.5630e+02 -4.8193e+00 -1.7628e+00
usage_capacity1000 -2.8486e+01 -1.0323e+00 -4.3651e-01
water_source_cleanProtected.Shallow.Well -9.6778e+01 -5.0858e-01 5.6050e-01
water_source_cleanProtected.Spring -7.6566e+02 -5.5969e+00 2.6067e+00
3rd Qu. Max.
Intercept 1.5360e+01 1249.9463
distance_to_primary_road 5.3165e-04 0.0262
distance_to_secondary_road 7.0105e-04 0.0401
distance_to_tertiary_road 7.7234e-04 0.0162
distance_to_city 2.8560e-04 0.0397
distance_to_town 2.3354e-04 0.0262
water_point_population 6.7288e-04 0.1505
local_population_1km 1.9569e-03 0.0393
is_urbanTRUE 1.5184e+00 932.6919
usage_capacity1000 3.1734e-01 7.7429
water_source_cleanProtected.Shallow.Well 1.8965e+00 54.7762
water_source_cleanProtected.Spring 7.1155e+00 894.8218
************************Diagnostic information*************************
Number of data points: 4756
GW Deviance: 2623.655
AIC : 4358.358
AICc : 4745.824
Pseudo R-square value: 0.5984904
***********************************************************************
Program stops at: 2022-12-18 00:34:37
A lower AIC indicated a better fit model.
Confusion Matrix
We will convert the SDF object to a dataframe first:
gwr.fixed <- as.data.frame(gwlr.fixed$SDF)Next, we will change the probability to True or False:
gwr.fixed <- gwr.fixed %>% mutate(most=ifelse(gwr.fixed$yhat >= 0.5, T, F))Using confusionMatrix() of the caret package we run the following code:
gwr.fixed$y <- as.factor(gwr.fixed$y)
gwr.fixed$most <- as.factor(gwr.fixed$most)
CM <- confusionMatrix(data=gwr.fixed$most, reference = gwr.fixed$y)
CMConfusion Matrix and Statistics
Reference
Prediction FALSE TRUE
FALSE 1843 241
TRUE 271 2401
Accuracy : 0.8923
95% CI : (0.8832, 0.901)
No Information Rate : 0.5555
P-Value [Acc > NIR] : <2e-16
Kappa : 0.7817
Mcnemar's Test P-Value : 0.2
Sensitivity : 0.8718
Specificity : 0.9088
Pos Pred Value : 0.8844
Neg Pred Value : 0.8986
Prevalence : 0.4445
Detection Rate : 0.3875
Detection Prevalence : 0.4382
Balanced Accuracy : 0.8903
'Positive' Class : FALSE
Here we see a big improvement in accuracy, sensitivity & specificity. Thus by incorporating the geographical weights, we have now obtained a better model with better excitability.
Step 7: Visualizing the results
First we will select the required columns:
osun_wp_sf_selected <- osun_wp_sf_clean %>% select(c(ADM2_EN, ADM2_PCODE,
ADM1_EN, ADM1_PCODE,
status))Next we will cbind the earlier model results to get a new dataframe for plotting:
gwr_sf.fixed <- cbind(osun_wp_sf_selected, gwr.fixed)We plot the map as follow:
tmap_mode("view")tmap mode set to interactive viewing
prob_T <- tm_shape(osun) +
tm_polygons(alpha=0.1) +
tm_shape(gwr_sf.fixed) +
tm_dots(col="yhat",
border.col = "gray60",
border.lwd = 1) +
tm_view(set.zoom.limits = c(9,14))
prob_Ttertiary_TV <- tm_shape(osun) +
tm_polygons(alpha=0.1) +
tm_shape(gwr_sf.fixed) +
tm_dots(col="distance_to_tertirary_road_TV", border.col="gray60", border.lwd=1) + tm_view(set.zoom.limits = c(8,14))tmap_arrange(tertiary_SE, tertiary_TV, asp=1, ncol=2, sync=True)